Differential expression analysis of LCM RNA Data

Managing Packages Using Renv

To run this code in my project using the renv environment, run the following lines of code

install.packages("renv") #install the package on the new computer (may not be necessary if renv bootstraps itself as expected)
renv::restore() #reinstall all the package versions in the renv lockfile

Install packages

renv::install("bioc::genefilter")
renv::install("DESeq2")
renv::install("bioc::apeglm")
renv::install("ashr")
renv::install("ggplot2")
renv::install("vsn")
renv::install("hexbin")
renv::install("pheatmap")
renv::install("RColorBrewer")
renv::install("bioc::EnhancedVolcano")
renv::install("tidyverse")
library(genefilter)
library(DESeq2)
## Warning: package 'DESeq2' was built under R version 4.3.3
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 4.3.3
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:genefilter':
## 
##     rowSds, rowVars
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following objects are masked from 'package:genefilter':
## 
##     rowSds, rowVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(apeglm)
library(ashr)
library(ggplot2)
library(vsn)
library(hexbin)
## Warning: package 'hexbin' was built under R version 4.3.3
library(pheatmap)
library(RColorBrewer)
library(EnhancedVolcano)
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.3.3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks IRanges::slice()
## ✖ readr::spec()         masks genefilter::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
sessionInfo() #provides list of loaded packages and version of R.
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] lubridate_1.9.3             forcats_1.0.0              
##  [3] stringr_1.5.1               dplyr_1.1.4                
##  [5] purrr_1.0.2                 readr_2.1.5                
##  [7] tidyr_1.3.1                 tibble_3.2.1               
##  [9] tidyverse_2.0.0             EnhancedVolcano_1.20.0     
## [11] ggrepel_0.9.6               RColorBrewer_1.1-3         
## [13] pheatmap_1.0.12             hexbin_1.28.4              
## [15] vsn_3.70.0                  ggplot2_3.5.1              
## [17] ashr_2.2-63                 apeglm_1.24.0              
## [19] DESeq2_1.42.1               SummarizedExperiment_1.30.2
## [21] Biobase_2.60.0              MatrixGenerics_1.12.3      
## [23] matrixStats_1.4.1           GenomicRanges_1.52.1       
## [25] GenomeInfoDb_1.36.4         IRanges_2.34.1             
## [27] S4Vectors_0.38.2            BiocGenerics_0.46.0        
## [29] genefilter_1.84.0          
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.2.3               bitops_1.0-8            rlang_1.1.4            
##  [4] magrittr_2.0.3          compiler_4.3.2          RSQLite_2.3.7          
##  [7] png_0.1-8               vctrs_0.6.5             pkgconfig_2.0.3        
## [10] crayon_1.5.3            fastmap_1.2.0           XVector_0.40.0         
## [13] utf8_1.2.4              rmarkdown_2.28          tzdb_0.4.0             
## [16] preprocessCore_1.64.0   bit_4.5.0               xfun_0.47              
## [19] zlibbioc_1.46.0         cachem_1.1.0            jsonlite_1.8.9         
## [22] blob_1.2.4              DelayedArray_0.26.7     BiocParallel_1.34.2    
## [25] irlba_2.3.5.1           parallel_4.3.2          R6_2.5.1               
## [28] stringi_1.8.4           bslib_0.8.0             limma_3.58.1           
## [31] SQUAREM_2021.1          jquerylib_0.1.4         numDeriv_2016.8-1.1    
## [34] Rcpp_1.0.13             knitr_1.48              timechange_0.3.0       
## [37] Matrix_1.6-5            splines_4.3.2           tidyselect_1.2.1       
## [40] rstudioapi_0.16.0       abind_1.4-8             yaml_2.3.10            
## [43] codetools_0.2-20        affy_1.80.0             lattice_0.22-6         
## [46] plyr_1.8.9              withr_3.0.1             KEGGREST_1.40.1        
## [49] coda_0.19-4.1           evaluate_1.0.0          survival_3.7-0         
## [52] Biostrings_2.68.1       pillar_1.9.0            affyio_1.72.0          
## [55] BiocManager_1.30.25     renv_1.0.10             generics_0.1.3         
## [58] invgamma_1.1            RCurl_1.98-1.16         truncnorm_1.0-9        
## [61] emdbook_1.3.13          hms_1.1.3               munsell_0.5.1          
## [64] scales_1.3.0            xtable_1.8-4            glue_1.8.0             
## [67] tools_4.3.2             annotate_1.78.0         locfit_1.5-9.10        
## [70] mvtnorm_1.3-1           XML_3.99-0.17           grid_4.3.2             
## [73] bbmle_1.0.25.1          bdsmatrix_1.3-7         AnnotationDbi_1.62.2   
## [76] colorspace_2.1-1        GenomeInfoDbData_1.2.10 cli_3.6.3              
## [79] fansi_1.0.6             mixsqp_0.3-54           S4Arrays_1.0.6         
## [82] gtable_0.3.5            sass_0.4.9              digest_0.6.37          
## [85] memoise_2.0.1           htmltools_0.5.8.1       lifecycle_1.0.4        
## [88] httr_1.4.7              statmod_1.5.0           bit64_4.5.2            
## [91] MASS_7.3-60.0.1

Read and clean count matrix and metadata

Read in raw count data

counts_raw <- read.csv("../output_RNA/stringtie/LCM_RNA_gene_count_matrix.csv") #load in data
rownames(counts_raw) <- counts_raw[,1] #set first column that contains gene names as rownames
counts_raw <- counts_raw[,-1] #remove the column with gene names

gene_id,LCM_15,LCM_16,LCM_20,LCM_21,LCM_26,LCM_27,LCM_4,LCM_5,LCM_8,LCM_9

Read in metadata

meta <- read.csv("../data_RNA/LCM_RNA_metadata.csv")
meta <- dplyr::arrange(meta, Sample)

# Set variables as factors 
meta$Tissue <- factor(meta$Tissue, levels = c("OralEpi","Aboral")) #we want OralEpi to be the baseline

meta$Fragment <- factor(meta$Fragment)
meta$Section_Date <- factor(meta$Section_Date)
meta$LCM_Date <- factor(meta$LCM_Date)

Data sanity checks!

all(meta$Sample %in% colnames(counts_raw)) #are all of the sample names in the metadata column names in the gene count matrix? Should be TRUE
## [1] TRUE
all(meta$Sample == colnames(counts_raw)) #are they the same in the same order? Should be TRUE
## [1] TRUE

pOverA filtering to reduce dataset

ffun<-filterfun(pOverA(0.25,5))  #set up filtering parameters
counts_filt_poa <- genefilter((counts_raw), ffun) #apply filter
sum(counts_filt_poa) #count number of genes left
## [1] 15530
filtered_counts <- counts_raw[counts_filt_poa,] #keep only rows that passed filter

There are now 15530 genes in the filtered dataset.

Data sanity checks:

all(meta$Sample %in% colnames(filtered_counts)) #are all of the sample names in the metadata column names in the gene count matrix?
## [1] TRUE
all(meta$Sample == colnames(filtered_counts))  #are they the same in the same order? Should be TRUE
## [1] TRUE

Following DESeq2 Vignette

Create DESeq object and run DESeq2

dds <- DESeqDataSetFromMatrix(countData = filtered_counts,
                              colData = meta,
                              design= ~ Fragment + Tissue)

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Extract results for Aboral vs. OralEpi contrast

res <- results(dds, contrast = c("Tissue","Aboral","OralEpi"))
resLFC <- lfcShrink(dds, coef="Tissue_Aboral_vs_OralEpi", res=res, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the moving direction increases the objective function value
EnhancedVolcano(res, 
    lab = rownames(res),
    x = 'log2FoldChange',
    y = 'pvalue')

EnhancedVolcano(resLFC, 
    lab = rownames(resLFC),
    x = 'log2FoldChange',
    y = 'pvalue')

Extract results for adjusted p-value < 0.05

res <- resLFC

resOrdered <- res[order(res$pvalue),]
# save differentially expressed genes

DE_05 <- as.data.frame(resOrdered) %>% filter(padj < 0.05)
DE_05_Up <- DE_05 %>% filter(log2FoldChange > 0) #Higher in Aboral, Lower in OralEpi
DE_05_Down <- DE_05 %>% filter(log2FoldChange < 0) #Lower in Aboral, Higher in OralEpi

DEGs <- rownames(DE_05)

nrow(DE_05)
## [1] 4177
nrow(DE_05_Up) #Higher in Aboral, Lower in OralEpi
## [1] 649
nrow(DE_05_Down) #Lower in Aboral, Higher in OralEpi
## [1] 3528
write.csv(as.data.frame(resOrdered), 
          file="../output_RNA/differential_expression/DESeq_results.csv")

write.csv(DE_05, 
          file="../output_RNA/differential_expression/DEG_05.csv")

Plots

plotMA(res, ylim=c(-20,20))

plotMA(resLFC, ylim=c(-20,20))

#After calling plotMA, one can use the function identify to interactively detect the row number of individual genes by clicking on the plot. One can then recover the gene identifiers by saving the resulting indices:

#idx <- identify(res$baseMean, res$log2FoldChange)
#rownames(res)[idx]
# because we are interested in the comparison and not the intercept, we set 'coef=2'
resNorm <- lfcShrink(dds, coef=2, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=2, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-5,5)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

plotCounts(dds, gene=which.max(res$log2FoldChange), intgroup="Tissue")

plotCounts(dds, gene=which.min(res$log2FoldChange), intgroup="Tissue")

Transforming count data for visualization

vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
ntd <- normTransform(dds) # this gives log2(n + 1)

meanSdPlot(assay(vsd), main = "vsd")
## Warning in geom_hex(bins = bins, ...): Ignoring unknown parameters: `main`

meanSdPlot(assay(rld))

meanSdPlot(assay(ntd))

Will move forward with vst transformation for visualizations

Heatmap of count matrix

df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

#view all genes
pheatmap(assay(vsd), cluster_rows=TRUE, show_rownames=FALSE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)

#view highest count genes
select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)

#view most significantly differentially expressed genes

select <- order(res$padj)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)

## Heatmap of the sample-to-sample distances

sampleDists <- dist(t(assay(vsd)))

sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Tissue, vsd$Fragment, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

Principal component plot of the samples

pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

Clearly, the majority of the variance in the data is explained by tissue type!

Annotation data

Download annotation files from genome website


# wget files
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.EggNog_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.KEGG_results.txt.gz

# move to references direcotry
mv *gz ../references

# unzip files
gunzip ../references/*gz
EggNog <- read.delim("../references/Pocillopora_acuta_HIv2.genes.EggNog_results.txt") %>% dplyr::rename("query" = X.query)

CDSearch <- read.delim("../references/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt", quote = "") %>% dplyr::rename("query" = X.Query)

KEGG <- read.delim("../references/Pocillopora_acuta_HIv2.genes.KEGG_results.txt", header = FALSE) %>% dplyr::rename("query" = V1, "KeggTerm" = V2)
DE_05$query <- rownames(DE_05)
DE_05_annot <- DE_05 %>% left_join(CDSearch) %>% select(query,everything())
## Joining with `by = join_by(query)`
DE_05_eggnog <- DE_05 %>% left_join(EggNog) %>% select(query,everything())
## Joining with `by = join_by(query)`
annot_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(CDSearch)
## Joining with `by = join_by(query)`
eggnog_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(EggNog)
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_eggnog), 
          file="../output_RNA/differential_expression/DE_05_eggnog_annotation.csv")
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
gene_labels <- eggnog_all %>% select(query,PFAMs) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) %>% #replace NAs with "" for labelling purposes
  separate(PFAMs, into = c("PFAMs", "rest of name"), sep = ",(?=.*?,)", extra = "merge")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 13646 rows [1, 2, 3, 4,
## 5, 7, 8, 10, 11, 12, 13, 14, 15, 18, 19, 21, 22, 23, 24, 25, ...].
#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)

# Extract the relevant assay data for the selected genes
selected_data <- assay(vsd)[select, ]

# Normalize via z-score (scale across rows)
zscore_data <- t(scale(t(selected_data)))

pheatmap(zscore_data, 
         cluster_rows = FALSE, 
         show_rownames = TRUE, 
         cluster_cols = TRUE, 
         cutree_cols = 2, 
         annotation_col = df, 
         labels_row = gene_labels[select, "PFAMs"], 
         fontsize_row = 5)

we probably want to correct for batch effects (LCM date) + genotype (Fragment)

and to decide which tranformation to use

Updating Renv environment:

After you’ve confirmed your code works as expected, use renv::snapshot() to record the packages and their sources in the lockfile.”